/* This do-file defines some key variables in a harmonized way across the 
three data points (2001, 2007, 2011) and creates the final dataset to be used  
for the analysis. Before running it, the ad hoc do-files for the 2001 and 2011 censuses and the 2007 community 
survey have to be run. */ 

* Do not pause the output at -more- prompts while the do-file runs 
set more off 

* Working directory: all datasets below are opened and saved with relative file names, so they resolve against this folder 
cd "" /* enter the input directory here; it must contain the data census_2001_wber, community_survey_2007_wber and census_2011_wber */


*******************
/***** 2001 ******/ 
*******************

/* 2001 census: derives the harmonized person-level and household-level
   variables and saves the result as temporary_census_2001.dta.

   Input : south_africa_census_2001_wber.dta (in the working directory)
   Output: temporary_census_2001.dta (overwritten if present)

   Variables created here: activity, emp, unemployed, self, black, married,
   born_sa, cohort, child_cohort, hh_emp, hh_school_attend, hh_hcap, hh_age,
   a_male, hh_a_male, children, hh_children. */

use "south_africa_census_2001_wber.dta", clear

***********************************
** Defining Labour Market Status ** 
***********************************

** Activity Dummy ** 
/* 1 for status 1, 2 or 3; 0 for status 4; missing for any other status
   value, including a missing status. */

gen activity=1 if inlist(status,1,2,3)
replace activity=0 if status==4

** Employment Dummy ** 
/* 1 for status 1 or 2; 0 for status 3 or 4; missing otherwise. */

gen emp=1 if inlist(status,1,2)
replace emp=0 if inlist(status,3,4)

** Unemployment Dummy ** 
/* Unlike activity and emp, this dummy is 0 (not missing) when status is
   missing. */

gen unemployed=(status==3)

** Self-Employment Dummy ** 
/* Also 0 (not missing) when status is missing. */

gen self=(status==2)

**  Other observables ** 

* Black * 

gen black=(race==1) 

* Married *
/* Marital status codes 1 to 4 are counted as married in this wave. */

gen married=inlist(marital_status,1,2,3,4)

* Born in South Africa *  
/* Code 99 of province_birth marks a birth outside South Africa in this wave.
   NOTE(review): a missing province_birth also satisfies "!=99", so persons
   with unknown birthplace are coded born_sa==1. Left unchanged because the
   right treatment is a sample-definition decision; please confirm. */

gen born_sa=(province_birth!=99) 

* Cohort * 

gen cohort=birth_year 

*******************************
* Youngest Child Birth Cohort * 
*******************************

/* Birth years after the census year and months above 12 are set to missing.
   Both comparisons are also true for values that are already missing, so
   every missing code ends up as the system missing value. */

replace yc_birth_year=. if yc_birth_year>2001
replace yc_birth_month=. if yc_birth_month>12

gen child_cohort=yc_birth_year 


*******************************
* Household Level Observables * 
*******************************

/* total() is the documented name of the egen function formerly called
   sum(); like sum(), it treats missing values as zero. mean() ignores
   missing values. */

* Household Employed * 

bys household_id: egen hh_emp=total(emp) 

* Household School Attend * 
/* NOTE(review): this is a count of household members attending school only
   if school_attend is coded 0/1 - confirm the coding. */

bys household_id: egen hh_school_attend=total(school_attend)

* Household Human Capital * 

bys household_id: egen hh_hcap=mean(yrs_schooling)

* Household Age * 

bys household_id: egen hh_age=mean(age)

* Household Adult Males * 
/* Stata ranks a missing value above every number, so "age>18" on its own
   would count males of unknown age as adults; the !missing() guard excludes
   them.
   NOTE(review): adults are age>18 and children are age<18, so a person aged
   exactly 18 is counted in neither group - confirm this is intended. */

gen a_male=(sex==1 & age>18 & !missing(age))

bys household_id year: egen hh_a_male=total(a_male)

* Household Number Children * 
/* A missing age is not below 18, so persons of unknown age are not counted
   as children. */

gen children=(age<18)

bys household_id year: egen hh_children=total(children)

** Exclude Unrealistic Values ** 
/* Excludes household dwellings that are unrealistically large. A missing
   household_size would also satisfy ">=50", so it is guarded explicitly:
   only households with a recorded size of 50 or more are removed.
   NOTE(review): this exclusion is applied to the 2001 wave only. */

drop if household_size>=50 & !missing(household_size)


***************************************

save temporary_census_2001.dta, replace


*******************
/***** 2007 ******/ 
*******************

/* 2007 community survey: derives the harmonized person-level and
   household-level variables and saves the result as
   temporary_community_2007.dta.

   Input : south_community_2007_wber.dta (in the working directory)
   Output: temporary_community_2007.dta (overwritten if present)

   NOTE(review): the input file name does not follow the
   south_africa_..._wber pattern of the other two waves - confirm that it
   matches the file produced by the 2007 preparation do-file. */

use "south_community_2007_wber.dta", clear

***********************************
** Defining Labour Market Status ** 
***********************************

** Activity Dummy ** 
/* 1 for status 1, 2 or 3; 0 for status 4. Any other status, including a
   missing one, is matched by neither statement and therefore stays missing,
   so no separate reset to missing is needed. */

gen activity=1 if inlist(status,1,2,3)
replace activity=0 if status==4

** Employment Dummy ** 
/* 1 for status 1 or 2; 0 for status 3 or 4; missing otherwise. */

gen emp=1 if inlist(status,1,2)
replace emp=0 if inlist(status,3,4)

** Unemployment Dummy ** 
/* 0 (not missing) when status is missing. */

gen unemployed=(status==3)

** Self-Employment Dummy ** 
/* 0 (not missing) when status is missing. */

gen self=(status==2)

/* Flags for employed persons whose occupation or industry carries code 9999.
   Both flags are 0 when emp is 0 or missing. */

gen occ_missing=(occupation==9999 & emp==1) 
gen ind_missing=(industry==9999 & emp==1) 

/* Industry code 0 is recoded to 10.
   NOTE(review): presumably aligns the 2007 coding with the other waves -
   confirm against the codebook. */

replace industry=10 if industry==0

**  Other observables ** 

* Black * 

gen black=(race==1) 

* Married * 
/* Marital status codes 1 to 4 are counted as married in this wave. */

gen married=inlist(marital_status,1,2,3,4)

* Born in South Africa *  
/* Code 10 of province_birth marks a birth outside South Africa in this wave.
   NOTE(review): a missing province_birth also satisfies "!=10", so persons
   with unknown birthplace are coded born_sa==1. Left unchanged because the
   right treatment is a sample-definition decision; please confirm. */

gen born_sa=(province_birth!=10) 

* Income category * 
/* Category 13 is set to missing (presumably the unspecified category -
   confirm against the codebook). */

replace income_category=. if income_category==13 

* Cohort * 

gen cohort=year - age /*year of birth information not available in the 2007 community survey */ 

*******************************
* Youngest Child Birth Cohort * 
*******************************

/* Birth years after the survey year and months above 12 are set to missing.
   Both comparisons are also true for values that are already missing, so
   every missing code ends up as the system missing value. */

replace yc_birth_year=. if yc_birth_year>2007
replace yc_birth_month=. if yc_birth_month>12

gen child_cohort=yc_birth_year 


*******************************
* Household Level Observables * 
*******************************

/* total() is the documented name of the egen function formerly called
   sum(); like sum(), it treats missing values as zero. mean() ignores
   missing values. */

* Household Employed * 

bys household_id: egen hh_emp=total(emp) 

* Household School Attend * 
/* NOTE(review): this is a count of household members attending school only
   if school_attend is coded 0/1 - confirm the coding. */

bys household_id: egen hh_school_attend=total(school_attend)

* Household Human Capital * 

bys household_id: egen hh_hcap=mean(yrs_schooling)

* Household Age * 

bys household_id: egen hh_age=mean(age)

* Household Adult Males * 
/* Stata ranks a missing value above every number, so "age>18" on its own
   would count males of unknown age as adults; the !missing() guard excludes
   them.
   NOTE(review): adults are age>18 and children are age<18, so a person aged
   exactly 18 is counted in neither group - confirm this is intended. */

gen a_male=(sex==1 & age>18 & !missing(age))
bys household_id year: egen hh_a_male=total(a_male)

* Household Number Children  * 
/* A missing age is not below 18, so persons of unknown age are not counted
   as children. */

gen children=(age<18)
bys household_id year: egen hh_children=total(children)


save temporary_community_2007.dta, replace


*******************
/***** 2011 ******/ 
*******************

/* 2011 census: derives the harmonized person-level and household-level
   variables and saves the result as temporary_census_2011.dta.

   Input : south_africa_census_2011_wber.dta (in the working directory)
   Output: temporary_census_2011.dta (overwritten if present) */

use "south_africa_census_2011_wber.dta", clear

***********************************
** Defining Labour Market Status ** 
***********************************

** Activity Dummy ** 
/* 1 for status 1, 2 or 3; 0 for status 4; missing for any other status
   value, including a missing status. */

gen activity=1 if inlist(status,1,2,3)
replace activity=0 if status==4

** Employment Dummy ** 
/* 1 for status 1 or 2; 0 for status 3 or 4; missing otherwise. */

gen emp=1 if inlist(status,1,2)
replace emp=0 if inlist(status,3,4)

** Unemployment Dummy ** 
/* 0 (not missing) when status is missing. */

gen unemployed=(status==3)

** Self-Employment Dummy ** 
/* 0 (not missing) when status is missing. */

gen self=(status==2) 

** Other observables ** 

* Black * 

gen black=(race==1) 

* Married * 
/* Only marital status codes 1 and 2 are counted as married in this wave,
   whereas the 2001 and 2007 sections use codes 1 to 4.
   NOTE(review): presumably the 2011 coding scheme differs - confirm. */

gen married=inlist(marital_status,1,2)


* Born in South Africa *  
/* Code 10 of province_birth marks a birth outside South Africa in this wave.
   NOTE(review): a missing province_birth also satisfies "!=10", so persons
   with unknown birthplace are coded born_sa==1. Left unchanged because the
   right treatment is a sample-definition decision; please confirm. */

gen born_sa=(province_birth!=10) 

* Income category * 
/* Category 99 is set to missing (presumably the unspecified category -
   confirm against the codebook). */

replace income_category=. if income_category==99 


* Cohort * 

gen cohort=birth_year 


*******************************
* Youngest Child Birth Cohort * 
*******************************

/* Birth years after the census year and months above 12 are set to missing.
   Both comparisons are also true for values that are already missing, so
   every missing code ends up as the system missing value. */

replace yc_birth_year=. if yc_birth_year>2011
replace yc_birth_month=. if yc_birth_month>12
 
gen child_cohort= yc_birth_year 


*******************************
* Household Level Observables * 
*******************************

/* total() is the documented name of the egen function formerly called
   sum(); like sum(), it treats missing values as zero. mean() ignores
   missing values. */

* Household Employed * 

bys household_id: egen hh_emp=total(emp) 


* Household School Attend * 
/* NOTE(review): this is a count of household members attending school only
   if school_attend is coded 0/1 - confirm the coding. */

bys household_id: egen hh_school_attend=total(school_attend)

* Household Human Capital * 

bys household_id: egen hh_hcap=mean(yrs_schooling)

* Household Age * 

bys household_id: egen hh_age=mean(age)

* Household Adult Males * 
/* Stata ranks a missing value above every number, so "age>18" on its own
   would count males of unknown age as adults; the !missing() guard excludes
   them.
   NOTE(review): adults are age>18 and children are age<18, so a person aged
   exactly 18 is counted in neither group - confirm this is intended. */

gen a_male=(sex==1 & age>18 & !missing(age))

bys household_id year: egen hh_a_male=total(a_male)

* Household Number Children * 
/* A missing age is not below 18, so persons of unknown age are not counted
   as children. */

gen children=(age<18)

bys household_id year: egen hh_children=total(children)

/* N.B.: be careful, the district variable changes across years. Use it ONLY
   within a year, or use a conversion table to revert to the 2005 numbering. */ 

save temporary_census_2011.dta, replace

**************************************************************************************
*********************
**  Append 3 waves **
*********************

/* Stacks the three temporary wave files, keeps the analysis variables,
   restricts the sample and saves appended_census_mothers.dta.

   Each keep statement requires every listed variable to exist in memory at
   that point, and the row filter is re-applied after each append so that
   the data in memory stay small. */

clear all 

* Variables relevant to the analysis that are requested after every step 
local common_vars household_id id year cohort municipality district child_cohort number_children yc_birth_month child_alive age race sex province province_birth yrs_schooling activity emp unemployed self occupation industry occ_specific household_size married weight born_sa income_category hh_emp hh_school_attend hh_hcap hh_age hh_a_male hh_children hh_rooms hh_tenure house_type fridge radio tv computer cell_phone hh_water electricity h_toilet

* More detailed occupation and industry codes, requested only once the 2011 wave has been appended 
local detail_vars occ_most_specific ind_specific

* --- 2001 census --- 
use  temporary_census_2001.dta, clear
keep `common_vars'
drop if child_cohort==. /* rows without a youngest-child birth year are removed: keep only mothers */

* --- add the 2007 community survey --- 
append using temporary_community_2007.dta
keep `common_vars'
drop if child_cohort==. /* keep only mothers */

* --- add the 2011 census --- 
append using temporary_census_2011.dta 
keep `common_vars' `detail_vars'
drop if child_cohort==. /* keep only mothers */


********************************************************************************

/* Sample restrictions, all applied in a single pass:
   - child_alive!=2 : keep mothers whose youngest child is still alive (rows
     with any other value of child_alive, including missing, are kept);
   - born_sa==1     : only keep people born in South Africa, as it is not
     clear whether others are eligible for the grant;
   - cohort 1960-1985 : only keep cohorts of mothers that can be observed
     over the entire period 2001-2011, given that the Census is censored at
     age 50 and labour market information is collected for individuals older
     than 15. See text for explanation. */

keep if child_alive!=2 & born_sa==1 & inrange(cohort,1960,1985)

/* With the force option, one observation is kept for each combination of
   household_id, id and year, and the other observations sharing that key
   are deleted even if they differ on other variables.
   NOTE(review): which of the duplicated rows survives depends on the current
   sort order - confirm this is acceptable. */

duplicates drop household_id id year, force

***********************************************************************************

save appended_census_mothers.dta, replace  /*save working dataset for estimation */
